Implementation of a new scheduler, based on BVT (Borrowed Virtual Time) but aiming to give a fairer allocation of CPU in diverse environments (CPU-bound domains running against I/O-bound ones). For that reason I called it Fair BVT (FBVT for short). The existing BVT implementation served as the starting point. This changeset also contains the interface to control the scheduler. Unfortunately, the context switch allowance cannot be changed yet (a bug). The parameters introduced in the scheduler are likely to change in the near future (after running tests).
3fbba6dbasJQV-MVElDC0DGSHMiL5w tools/libxc/xc_domain.c
40278d99BLsfUv3qxv0I8C1sClZ0ow tools/libxc/xc_elf.h
403e0977Bjsm_e82pwvl9VvaJxh8Gg tools/libxc/xc_evtchn.c
+40ec1922Nq_Rur5KUH0MvRNKczPGxg tools/libxc/xc_fbvtsched.c
40e03333Eegw8czSWvHsbKxrRZJjRA tools/libxc/xc_io.c
40e03333vrWGbLAhyJjXlqCHaJt7eA tools/libxc/xc_io.h
3fbba6dbNCU7U6nsMYiXzKkp3ztaJg tools/libxc/xc_linux_build.c
3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c
4064773cJ31vZt-zhbSoxqft1Jaw0w xen/common/sched_atropos.c
40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
+40ec1922He_dRhVJdOicTcHvT8v1NQ xen/common/sched_fair_bvt.c
40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c
SRCS :=
SRCS += xc_atropos.c
SRCS += xc_bvtsched.c
+SRCS += xc_fbvtsched.c
SRCS += xc_domain.c
SRCS += xc_evtchn.c
SRCS += xc_io.c
unsigned long *warpl,
unsigned long *warpu);
+/*
+ * Fair Borrowed Virtual Time (FBVT) scheduler control.
+ * Each call returns the result of the underlying dom0 operation.
+ */
+
+/* Set the scheduler-wide context switch allowance. */
+int xc_fbvtsched_global_set(int xc_handle,
+ unsigned long ctx_allow);
+
+/* Set the per-domain FBVT parameters (mcuadv, warp, warpl, warpu) of domid. */
+int xc_fbvtsched_domain_set(int xc_handle,
+ u32 domid,
+ unsigned long mcuadv,
+ unsigned long warp,
+ unsigned long warpl,
+ unsigned long warpu);
+
+/* Read the scheduler-wide context switch allowance into *ctx_allow. */
+int xc_fbvtsched_global_get(int xc_handle,
+ unsigned long *ctx_allow);
+
+/* Read the per-domain FBVT parameters of domid into the output pointers. */
+int xc_fbvtsched_domain_get(int xc_handle,
+ u32 domid,
+ unsigned long *mcuadv,
+ unsigned long *warp,
+ unsigned long *warpl,
+ unsigned long *warpu);
+
int xc_atropos_domain_set(int xc_handle,
u32 domid,
u64 period, u64 slice, u64 latency,
--- /dev/null
+/******************************************************************************
+ * xc_fbvtsched.c
+ *
+ * API for manipulating parameters of the Fair Borrowed Virtual Time scheduler.
+ *
+ * Copyright (c) 2004, G. Milos
+ * Based on K. Fraser's xc_bvtsched.c
+ */
+
+#include "xc_private.h"
+
+/*
+ * Push a new global context switch allowance to the FBVT scheduler.
+ * Returns whatever do_dom0_op() returns for the DOM0_SCHEDCTL request.
+ */
+int xc_fbvtsched_global_set(int xc_handle,
+                            unsigned long ctx_allow)
+{
+    dom0_op_t request;
+    int rc;
+
+    request.cmd = DOM0_SCHEDCTL;
+    request.u.schedctl.direction = SCHED_INFO_PUT;
+    request.u.schedctl.sched_id = SCHED_FBVT;
+    request.u.schedctl.u.fbvt.ctx_allow = ctx_allow;
+
+    rc = do_dom0_op(xc_handle, &request);
+    return rc;
+}
+
+/*
+ * Fetch the global context switch allowance of the FBVT scheduler.
+ *
+ * @xc_handle: handle passed through to do_dom0_op()
+ * @ctx_allow: output; written only when the operation succeeds
+ *
+ * Returns the result of do_dom0_op() (0 on success).
+ */
+int xc_fbvtsched_global_get(int xc_handle,
+                            unsigned long *ctx_allow)
+{
+    dom0_op_t op;
+    int ret;
+
+    op.cmd = DOM0_SCHEDCTL;
+    op.u.schedctl.sched_id = SCHED_FBVT;
+    op.u.schedctl.direction = SCHED_INFO_GET;
+
+    ret = do_dom0_op(xc_handle, &op);
+
+    /*
+     * 'op' lives on the stack and is only partially initialised above, so
+     * on failure the result field is indeterminate: do not copy it out.
+     */
+    if ( ret == 0 )
+        *ctx_allow = op.u.schedctl.u.fbvt.ctx_allow;
+
+    return ret;
+}
+
+/*
+ * Set the per-domain FBVT parameters of 'domid' through a DOM0_ADJUSTDOM
+ * request. Returns the result of do_dom0_op().
+ */
+int xc_fbvtsched_domain_set(int xc_handle,
+                            u32 domid,
+                            unsigned long mcuadv,
+                            unsigned long warp,
+                            unsigned long warpl,
+                            unsigned long warpu)
+{
+    dom0_op_t request;
+
+    /* Request header. */
+    request.cmd = DOM0_ADJUSTDOM;
+    request.u.adjustdom.domain = (domid_t)domid;
+    request.u.adjustdom.sched_id = SCHED_FBVT;
+    request.u.adjustdom.direction = SCHED_INFO_PUT;
+
+    /* FBVT-specific payload. */
+    request.u.adjustdom.u.fbvt.mcu_adv = mcuadv;
+    request.u.adjustdom.u.fbvt.warp = warp;
+    request.u.adjustdom.u.fbvt.warpl = warpl;
+    request.u.adjustdom.u.fbvt.warpu = warpu;
+
+    return do_dom0_op(xc_handle, &request);
+}
+
+
+/*
+ * Fetch the per-domain FBVT parameters of 'domid'.
+ *
+ * @xc_handle: handle passed through to do_dom0_op()
+ * @domid:     domain to query
+ * @mcuadv, @warp, @warpl, @warpu: outputs; written only on success
+ *
+ * Returns the result of do_dom0_op() (0 on success).
+ */
+int xc_fbvtsched_domain_get(int xc_handle,
+                            u32 domid,
+                            unsigned long *mcuadv,
+                            unsigned long *warp,
+                            unsigned long *warpl,
+                            unsigned long *warpu)
+{
+    dom0_op_t op;
+    int ret;
+    struct fbvt_adjdom *adjptr = &op.u.adjustdom.u.fbvt;
+
+    op.cmd = DOM0_ADJUSTDOM;
+    op.u.adjustdom.domain = (domid_t)domid;
+    op.u.adjustdom.sched_id = SCHED_FBVT;
+    op.u.adjustdom.direction = SCHED_INFO_GET;
+
+    ret = do_dom0_op(xc_handle, &op);
+
+    /*
+     * The FBVT payload of 'op' is never initialised locally, so on failure
+     * it holds indeterminate stack contents: leave the outputs untouched.
+     */
+    if ( ret != 0 )
+        return ret;
+
+    *mcuadv = adjptr->mcu_adv;
+    *warp = adjptr->warp;
+    *warpl = adjptr->warpl;
+    *warpu = adjptr->warpu;
+    return ret;
+}
"warpu", warpu);
}
+/*
+ * Python binding: fbvtsched_global_set(ctx_allow).
+ * Returns the shared 'zero' object on success; on failure raises xc_error
+ * from errno.
+ */
+static PyObject *pyxc_fbvtsched_global_set(PyObject *self,
+                                           PyObject *args,
+                                           PyObject *kwds)
+{
+    static char *kwd_list[] = { "ctx_allow", NULL };
+    XcObject *xc = (XcObject *)self;
+    unsigned long ctx_allow;
+    int rc;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "l", kwd_list, &ctx_allow) )
+        return NULL;
+
+    rc = xc_fbvtsched_global_set(xc->xc_handle, ctx_allow);
+    if ( rc != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+/*
+ * Python binding: fbvtsched_global_get().
+ * Takes no arguments. Returns a dict { "ctx_allow": <long> } on success;
+ * on failure raises xc_error from errno.
+ */
+static PyObject *pyxc_fbvtsched_global_get(PyObject *self,
+                                           PyObject *args,
+                                           PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+
+    unsigned long ctx_allow;
+
+    if ( !PyArg_ParseTuple(args, "") )
+        return NULL;
+
+    if ( xc_fbvtsched_global_get(xc->xc_handle, &ctx_allow) != 0 )
+        return PyErr_SetFromErrno(xc_error);
+
+    /*
+     * The braces are required to build a dict; without them Py_BuildValue
+     * returns a ("ctx_allow", value) tuple, contradicting the documented
+     * "Returns: [dict]" contract and the style of fbvtsched_domain_get.
+     */
+    return Py_BuildValue("{s:l}", "ctx_allow", ctx_allow);
+}
+
+/*
+ * Python binding: fbvtsched_domain_set(dom, mcuadv, warp, warpl, warpu).
+ * All five arguments are required. Returns the shared 'zero' object on
+ * success; on failure raises xc_error from errno.
+ */
+static PyObject *pyxc_fbvtsched_domain_set(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+
+ u32 dom;
+ unsigned long mcuadv, warp, warpl, warpu;
+
+ static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
+ "warpu", NULL };
+
+ /* "illll": one int (dom) followed by four longs. */
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "illll", kwd_list,
+ &dom, &mcuadv, &warp, &warpl, &warpu) )
+ return NULL;
+
+ if ( xc_fbvtsched_domain_set(xc->xc_handle, dom, mcuadv,
+ warp, warpl, warpu) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+/*
+ * Python binding: fbvtsched_domain_get(dom).
+ * Returns a dict with keys "domain", "mcuadv", "warp", "warpl" and "warpu"
+ * on success; on failure raises xc_error from errno.
+ */
+static PyObject *pyxc_fbvtsched_domain_get(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ u32 dom;
+ unsigned long mcuadv, warp, warpl, warpu;
+
+ static char *kwd_list[] = { "dom", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i", kwd_list, &dom) )
+ return NULL;
+
+ if ( xc_fbvtsched_domain_get(xc->xc_handle, dom, &mcuadv, &warp,
+ &warpl, &warpu) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ /* Echo the queried domain id back alongside its parameters. */
+ return Py_BuildValue("{s:i,s:l,s:l,s:l,s:l}",
+ "domain", dom,
+ "mcuadv", mcuadv,
+ "warp", warp,
+ "warpl", warpl,
+ "warpu", warpu);
+}
+
static PyObject *pyxc_evtchn_bind_interdomain(PyObject *self,
PyObject *args,
PyObject *kwds)
" warpl [long]: Warp limit,\n"
},
+ { "fbvtsched_global_set",
+ (PyCFunction)pyxc_fbvtsched_global_set,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set global tuning parameters for Fair Borrowed Virtual Time scheduler.\n"
+ " ctx_allow [int]: Minimal guaranteed quantum.\n\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "fbvtsched_global_get",
+ (PyCFunction)pyxc_fbvtsched_global_get,
+ METH_KEYWORDS, "\n"
+ "Get global tuning parameters for FBVT scheduler.\n"
+ "Returns: [dict]:\n"
+ " ctx_allow [int]: context switch allowance\n" },
+
+ { "fbvtsched_domain_set",
+ (PyCFunction)pyxc_fbvtsched_domain_set,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set per-domain tuning parameters for Fair Borrowed Virtual Time scheduler.\n"
+ " dom [int]: Identifier of domain to be tuned.\n"
+ " mcuadv [int]: Proportional to the inverse of the domain's weight.\n"
+ " warp [int]: How far to warp domain's EVT on unblock.\n"
+ " warpl [int]: How long the domain can run warped.\n"
+ " warpu [int]: How long before the domain can warp again.\n\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "fbvtsched_domain_get",
+ (PyCFunction)pyxc_fbvtsched_domain_get,
+ METH_KEYWORDS, "\n"
+ "Get per-domain tuning parameters under the FBVT scheduler.\n"
+ " dom [int]: Identifier of domain to be queried.\n"
+ "Returns [dict]:\n"
+ " domain [int]: Domain ID.\n"
+ " mcuadv [long]: MCU Advance.\n"
+ " warp [long]: Warp.\n"
+ " warpu [long]: Unwarp requirement.\n"
+ " warpl [long]: Warp limit,\n"
+ },
+
{ "atropos_domain_set",
(PyCFunction)pyxc_atropos_domain_set,
METH_KEYWORDS, "\n"
{'op' : 'cpu_rrobin_slice_set',
'slice' : slice })
- def xend_node_cpu_bvt_slice_set(self, slice):
+ def xend_node_cpu_bvt_slice_set(self, ctx_allow):
return xend_call(self.nodeurl(),
{'op' : 'cpu_bvt_slice_set',
- 'slice' : slice })
+ 'ctx_allow' : ctx_allow })
+
+ def xend_node_cpu_fbvt_slice_set(self, ctx_allow):
+ """Send a 'cpu_fbvt_slice_set' request carrying ctx_allow (the FBVT
+ context switch allowance) to the node URL.
+ """
+ return xend_call(self.nodeurl(),
+ {'op' : 'cpu_fbvt_slice_set',
+ 'ctx_allow' : ctx_allow })
def xend_domains(self):
return xend_get(self.domainurl())
def xend_domain_cpu_bvt_set(self, id, mcuadv, warp, warpl, warpu):
return xend_call(self.domainurl(id),
{'op' : 'cpu_bvt_set',
- 'mcuadv' : mvuadv,
+ 'mcuadv' : mcuadv,
'warp' : warp,
'warpl' : warpl,
'warpu' : warpu })
+
+ def xend_domain_cpu_fbvt_set(self, id, mcuadv, warp, warpl, warpu):
+ """Send a 'cpu_fbvt_set' request with the FBVT parameters (mcuadv,
+ warp, warpl, warpu) to the URL of domain 'id'.
+ """
+ return xend_call(self.domainurl(id),
+ {'op' : 'cpu_fbvt_set',
+ 'mcuadv' : mcuadv,
+ 'warp' : warp,
+ 'warpl' : warpl,
+ 'warpu' : warpu })
+
def xend_domain_cpu_atropos_set(self, id, period, slice, latency, xtratime):
return xend_call(self.domainurl(id),
dom = int(dom)
return xc.bvtsched_domain_get(dom)
+ def domain_cpu_fbvt_set(self, dom, mcuadv, warp, warpl, warpu):
+ """Set FBVT (Fair Borrowed Virtual Time) scheduler parameters for a domain.
+
+ dom    -- domain id (converted with int())
+ mcuadv -- proportional to the inverse of the domain's weight
+ warp   -- how far to warp the domain's EVT on unblock
+ warpl  -- how long the domain can run warped
+ warpu  -- how long before the domain can warp again
+
+ Returns the result of xc.fbvtsched_domain_set().
+ """
+ dom = int(dom)
+ return xc.fbvtsched_domain_set(dom=dom, mcuadv=mcuadv,
+ warp=warp, warpl=warpl, warpu=warpu)
+
+ def domain_cpu_fbvt_get(self, dom):
+ """Get FBVT (Fair Borrowed Virtual Time) scheduler parameters for a domain.
+
+ dom -- domain id (converted with int())
+
+ Returns the result of xc.fbvtsched_domain_get().
+ """
+ dom = int(dom)
+ return xc.fbvtsched_domain_get(dom)
+
def domain_cpu_atropos_set(self, dom, period, slice, latency, xtratime):
"""Set Atropos scheduler parameters for a domain.
"""
def notify(self, uri):
return 0
- def cpu_bvt_slice_set(self, slice):
+ def cpu_bvt_slice_set(self, ctx_allow):
ret = 0
#ret = self.xc.bvtsched_global_set(ctx_allow=slice)
return ret
- def cpu_bvt_slice_get(self, slice):
+ def cpu_bvt_slice_get(self, ctx_allow):
+ ret = 0
+ #ret = self.xc.bvtsched_global_get()
+ return ret
+
+ def cpu_fbvt_slice_set(self, ctx_allow):
+ # Stub: currently ignores ctx_allow and always returns 0.
+ # The hypervisor call is left disabled below; when re-enabled it should
+ # use the FBVT binding and the ctx_allow argument.
+ ret = 0
+ #ret = self.xc.fbvtsched_global_set(ctx_allow=ctx_allow)
+ return ret
+
+ def cpu_fbvt_slice_get(self, ctx_allow):
ret = 0
#ret = self.xc.bvtsched_global_get()
return ret
['warpu', 'int']])
val = fn(req.args, {'dom': self.dom.id})
return val
+
+ def op_cpu_fbvt_set(self, op, req):
+ """Handle the 'cpu_fbvt_set' operation.
+
+ Builds a FormFn around self.xd.domain_cpu_fbvt_set with five 'int'
+ arguments and calls it with req.args, supplying this domain's id
+ as 'dom'.
+ """
+ fn = FormFn(self.xd.domain_cpu_fbvt_set,
+ [['dom', 'int'],
+ ['mcuadv', 'int'],
+ ['warp', 'int'],
+ ['warpl', 'int'],
+ ['warpu', 'int']])
+ val = fn(req.args, {'dom': self.dom.id})
+ return val
def op_cpu_atropos_set(self, op, req):
fn = FormFn(self.xd.domain_cpu_atropos_set,
from SrvDir import SrvDir
from xen.xend import sxp
from xen.xend import XendNode
+from xen.xend.Args import FormFn
class SrvNode(SrvDir):
"""Information about the node.
def op_cpu_bvt_slice_set(self, op, req):
fn = FormFn(self.xn.cpu_bvt_slice_set,
- [['slice', 'int']])
+ [['ctx_allow', 'int']])
+ val = fn(req.args, {})
+ return val
+
+ def op_cpu_fbvt_slice_set(self, op, req):
+ fn = FormFn(self.xn.cpu_fbvt_slice_set,
+ [['ctx_allow', 'int']])
val = fn(req.args, {})
return val
class ProgBvtslice(Prog):
group = 'scheduler'
- name = "bvtslice"
- info = """Set the BVT scheduler slice."""
+ name = "bvt_ctxallow"
+ info = """Set the BVT scheduler context switch allowance."""
def help(self, args):
- print args[0], 'SLICE'
- print '\nSet Borrowed Virtual Time scheduler slice.'
+ print args[0], 'CTX_ALLOW'
+ print '\nSet Borrowed Virtual Time scheduler context switch allowance.'
def main(self, args):
- if len(args) < 2: self.err('%s: Missing slice' % args[0])
+ if len(args) < 2: self.err('%s: Missing context switch allowance'
+ % args[0])
server.xend_node_cpu_bvt_slice_set(slice)
xm.prog(ProgBvtslice)
+# xm subcommand "fbvt": sets the per-domain FBVT scheduler parameters.
+class ProgFbvt(Prog):
+ group = 'scheduler'
+ name = "fbvt"
+ info = """Set FBVT scheduler parameters."""
+
+ def help(self, args):
+ print args[0], "DOM MCUADV WARP WARPL WARPU"
+ print '\nSet Fair Borrowed Virtual Time scheduler parameters.'
+
+ def main(self, args):
+ # Expect the command name plus exactly five integer arguments.
+ if len(args) != 6: self.err("%s: Invalid argument(s)" % args[0])
+ v = map(int, args[1:6])
+ server.xend_domain_cpu_fbvt_set(*v)
+
+xm.prog(ProgFbvt)
+
+# xm subcommand "fbvt_ctxallow": sets the global FBVT context switch allowance.
+class ProgFbvtslice(Prog):
+    group = 'scheduler'
+    name = "fbvt_ctxallow"
+    info = """Set the FBVT scheduler context switch allowance."""
+
+    def help(self, args):
+        print args[0], 'CTX_ALLOW'
+        print '\nSet Fair Borrowed Virtual Time scheduler context switch allowance.'
+
+    def main(self, args):
+        if len(args) < 2: self.err('%s: Missing context switch allowance.'
+                                    % args[0])
+        # Parse the allowance from the command line. The previous code passed
+        # the name 'slice', which was never assigned here and therefore
+        # resolved to the builtin slice type.
+        ctx_allow = int(args[1])
+        server.xend_node_cpu_fbvt_slice_set(ctx_allow)
+
+xm.prog(ProgFbvtslice)
+
+
class ProgAtropos(Prog):
group = 'scheduler'
name= "atropos"
next->min_slice = ctx_allow;
ret.task = next;
ret.time = r_time;
-
return ret;
}
--- /dev/null
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: common/sched_fair_bvt.c
+ * Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
+ *
+ * Description: CPU scheduling
+ * implements a Fair Borrowed Virtual Time (FBVT) scheduler,
+ * derived from the Borrowed Virtual Time scheduler
+ * (see Duda & Cheriton SOSP'99)
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/ac_timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/slab.h>
+
+
+/*
+ * All per-domain FBVT-specific scheduling info is stored here.
+ * One instance hangs off each domain's sched_priv pointer (see FBVT_INFO()).
+ */
+struct fbvt_dom_info
+{
+ unsigned long mcu_advance; /* inverse of weight */
+ u32 avt; /* actual virtual time */
+ u32 evt; /* effective virtual time */
+ u32 time_slept; /* virtual time credited on wake-up, consumed while running */
+ u32 vtb; /* virtual time bonus, derived from time_slept when scheduled */
+ int warpback; /* warp? (set on wake-up, cleared on block/unwarp) */
+ long warp; /* virtual time warp */
+ long warpl; /* warp limit */
+ long warpu; /* unwarp time requirement */
+ s_time_t warped; /* time it ran warped last time (set on wake-up) */
+ s_time_t uwarped; /* time it ran unwarped last time (set when unwarping) */
+};
+
+/* Per-CPU FBVT state, reached through schedule_data[cpu].sched_priv. */
+struct fbvt_cpu_info
+{
+ /* System virtual time; the scheduler sets it to the minimum avt seen. */
+ unsigned long svt; /* XXX check this is unsigned long! */
+};
+
+
+/* Accessors for the scheduler-private data of a domain and of a CPU. */
+#define FBVT_INFO(p) ((struct fbvt_dom_info *)(p)->sched_priv)
+#define CPU_INFO(cpu) ((struct fbvt_cpu_info *)(schedule_data[cpu]).sched_priv)
+#define CPU_SVT(cpu) (CPU_INFO(cpu)->svt)
+
+#define MCU (s32)MICROSECS(100) /* Minimum unit */
+#define MCU_ADVANCE 10 /* default weight */
+#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
+/* Global tunable, read and written through fbvt_ctl(). */
+static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */
+
+/* SLAB cache for struct fbvt_dom_info objects */
+static kmem_cache_t *dom_info_cache;
+
+/*
+ * Recompute a domain's effective virtual time (evt) from its actual
+ * virtual time (avt). The warp is applied only while the domain is flagged
+ * as warping and both warp limits are respected; otherwise the domain is
+ * unwarped.
+ */
+static void __calc_evt(struct fbvt_dom_info *inf)
+{
+    s_time_t now = NOW();
+    int may_warp;
+
+    if ( !inf->warpback )
+    {
+        /* Not warping: evt simply tracks avt. */
+        inf->evt = inf->avt;
+        return;
+    }
+
+    may_warp = ((now - inf->warped) < inf->warpl) &&
+               ((now - inf->uwarped) > inf->warpu);
+
+    if ( may_warp )
+    {
+        /* allowed to warp */
+        inf->evt = inf->avt - inf->warp;
+        return;
+    }
+
+    /* warped for too long -> unwarp */
+    inf->evt = inf->avt;
+    inf->uwarped = now;
+    inf->warpback = 0;
+}
+
+/**
+ * fbvt_alloc_task - allocate FBVT private structures for a task
+ * @p: task whose sched_priv is to be populated
+ *
+ * Takes one object from dom_info_cache and stores it in p->sched_priv.
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+int fbvt_alloc_task(struct domain *p)
+{
+    void *priv = kmem_cache_alloc(dom_info_cache);
+
+    p->sched_priv = priv;
+
+    return (priv != NULL) ? 0 : -1;
+}
+
+/**
+ * fbvt_add_task - initialise FBVT scheduling state for a new domain
+ * @p: domain whose private data was allocated by fbvt_alloc_task()
+ *
+ * The idle domain gets the maximum avt/evt; every other domain starts at
+ * the current system virtual time of its CPU.
+ */
+void fbvt_add_task(struct domain *p)
+{
+    struct fbvt_dom_info *inf;
+
+    /* Validate p before FBVT_INFO() dereferences it. */
+    ASSERT(p != NULL);
+    inf = FBVT_INFO(p);
+    ASSERT(inf != NULL);
+
+    /*
+     * Give every field a defined value for all domains, the idle domain
+     * included: fbvt_do_schedule() reads time_slept and writes vtb for
+     * whichever domain it picks, and __calc_evt() reads warped/uwarped.
+     */
+    inf->mcu_advance = MCU_ADVANCE;
+    inf->vtb = 0;
+    inf->time_slept = 0;
+    inf->warpback = 0;
+    inf->warp = 0;
+    inf->warpl = 0;
+    inf->warpu = 0;
+    inf->warped = 0;
+    inf->uwarped = 0;
+
+    if ( p->domain == IDLE_DOMAIN_ID )
+    {
+        inf->avt = inf->evt = ~0U;
+    }
+    else
+    {
+        /* Set avt and evt to system virtual time. */
+        inf->avt = CPU_SVT(p->processor);
+        inf->evt = CPU_SVT(p->processor);
+    }
+
+    return;
+}
+
+/**
+ * fbvt_free_task - free FBVT private structures for a task
+ * @p: task whose sched_priv was allocated by fbvt_alloc_task()
+ *
+ * Returns the object to dom_info_cache; p->sched_priv itself is not cleared.
+ */
+void fbvt_free_task(struct domain *p)
+{
+ ASSERT( p->sched_priv != NULL );
+ kmem_cache_free( dom_info_cache, p->sched_priv );
+}
+
+
+/*
+ * Wake a domain: bring its virtual time up to date, flag it as warping,
+ * recompute its evt and put it at the head of the run queue.
+ */
+void fbvt_wake_up(struct domain *p)
+{
+    struct fbvt_dom_info *inf = FBVT_INFO(p);
+    s32 io_warp;
+
+    ASSERT(inf != NULL);
+
+    /* set the BVT parameters */
+    if ( inf->avt < CPU_SVT(p->processor) )
+    {
+        /*
+         * We want IO bound processes to gain dispatch precedence. This is
+         * especially for device driver domains. Therefore AVT should not
+         * be updated to SVT but to a value marginally smaller.
+         * Since frequently sleeping domains have high time_slept values,
+         * the virtual time can be determined as:
+         *     SVT - const * TIME_SLEPT
+         *
+         * The constant is 1/2. It is computed with integer arithmetic:
+         * floating point must not be used in the hypervisor, and the
+         * quotient of a u32 by 2 always fits in an s32.
+         */
+        io_warp = (s32)(inf->time_slept / 2);
+        if ( io_warp > 10000 )
+            io_warp = 10000;
+
+        ASSERT(inf->time_slept + CPU_SVT(p->processor) > inf->avt + io_warp);
+        /* Credit the virtual time the domain missed while it slept. */
+        inf->time_slept += CPU_SVT(p->processor) - inf->avt - io_warp;
+        inf->avt = CPU_SVT(p->processor) - io_warp;
+    }
+
+    /* deal with warping here */
+    inf->warpback = 1;
+    inf->warped = NOW();
+    __calc_evt(inf);
+    __add_to_runqueue_head(p);
+}
+
+/*
+ * Scheduler hook for a blocking domain: clears the domain's warpback flag.
+ * Nothing else is done here.
+ */
+static void fbvt_do_block(struct domain *p)
+{
+ FBVT_INFO(p)->warpback = 0;
+}
+
+/*
+ * Control the scheduler: with SCHED_INFO_PUT the global context switch
+ * allowance is taken from the command, for any other direction it is
+ * written back into the command. Always returns 0.
+ */
+int fbvt_ctl(struct sched_ctl_cmd *cmd)
+{
+    struct fbvt_ctl *fbvt_cmd = &cmd->u.fbvt;
+
+    if ( cmd->direction != SCHED_INFO_PUT )
+    {
+        fbvt_cmd->ctx_allow = ctx_allow;
+        return 0;
+    }
+
+    ctx_allow = fbvt_cmd->ctx_allow;
+    return 0;
+}
+
+/*
+ * Adjust or read the scheduling parameters of a given domain.
+ *
+ * SCHED_INFO_PUT: copy mcu_adv, warp, warpl and warpu from the command into
+ * the domain's FBVT info; a zero mcu_adv is rejected with -EINVAL.
+ * SCHED_INFO_GET: copy the domain's current values into the command.
+ * Any other direction is silently ignored. Returns 0 unless rejected.
+ */
+int fbvt_adjdom(struct domain *p,
+ struct sched_adjdom_cmd *cmd)
+{
+ struct fbvt_adjdom *params = &cmd->u.fbvt;
+ unsigned long flags;
+
+ if ( cmd->direction == SCHED_INFO_PUT )
+ {
+ unsigned long mcu_adv = params->mcu_adv,
+ warp = params->warp,
+ warpl = params->warpl,
+ warpu = params->warpu;
+
+ struct fbvt_dom_info *inf = FBVT_INFO(p);
+
+ /* Logs the values currently in effect, before they are replaced. */
+ DPRINTK("Get domain %u fbvt mcu_adv=%ld, warp=%ld, "
+ "warpl=%ld, warpu=%ld\n",
+ p->domain, inf->mcu_advance, inf->warp,
+ inf->warpl, inf->warpu );
+
+ /* Sanity -- this can avoid divide-by-zero. */
+ if ( mcu_adv == 0 )
+ return -EINVAL;
+
+ /* Update all four parameters under the CPU's schedule lock. */
+ spin_lock_irqsave(&schedule_lock[p->processor], flags);
+ inf->mcu_advance = mcu_adv;
+ inf->warp = warp;
+ inf->warpl = warpl;
+ inf->warpu = warpu;
+
+ DPRINTK("Set domain %u fbvt mcu_adv=%ld, warp=%ld, "
+ "warpl=%ld, warpu=%ld\n",
+ p->domain, inf->mcu_advance, inf->warp,
+ inf->warpl, inf->warpu );
+
+ spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
+ }
+ else if ( cmd->direction == SCHED_INFO_GET )
+ {
+ struct fbvt_dom_info *inf = FBVT_INFO(p);
+
+ /* Read all four parameters under the CPU's schedule lock. */
+ spin_lock_irqsave(&schedule_lock[p->processor], flags);
+ params->mcu_adv = inf->mcu_advance;
+ params->warp = inf->warp;
+ params->warpl = inf->warpl;
+ params->warpu = inf->warpu;
+ spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
+ }
+
+ return 0;
+}
+
+
+/*
+ * The main function
+ * - deschedule the current domain, charging it for the time it ran
+ *   (less its virtual time bonus) and requeueing it if still runnable.
+ * - pick a new domain, i.e., the domain with lowest EVT, by scanning the
+ *   run queue; the runner-up ('next_prime') is tracked as well.
+ * - update the CPU's system virtual time and handle virtual time overrun.
+ * - work out how long the chosen domain may run.
+ *
+ * Returns the chosen domain and its time slice.
+ */
+static task_slice_t fbvt_do_schedule(s_time_t now)
+{
+    struct domain *prev = current, *next = NULL, *next_prime, *p;
+    struct list_head *tmp;
+    int cpu = prev->processor;
+    s32 r_time;     /* time for new dom to run */
+    s32 ranfor;     /* assume we never run longer than 2.1s! */
+    s32 mcus;
+    u32 next_evt, next_prime_evt, min_avt;
+    struct fbvt_dom_info *prev_inf = FBVT_INFO(prev),
+                         *p_inf = NULL,
+                         *next_inf = NULL,
+                         *next_prime_inf = NULL;
+    task_slice_t ret;
+
+    ASSERT(prev->sched_priv != NULL);
+    ASSERT(prev_inf != NULL);
+
+    if ( likely(!is_idle_task(prev)) )
+    {
+        ranfor = (s32)(now - prev->lastschd);
+        /* Calculate mcu and update avt. */
+        mcus = (ranfor + MCU - 1) / MCU;
+        if ( mcus * prev_inf->mcu_advance < prev_inf->vtb )
+        {
+            /* The run was fully covered by the bonus: avt does not move. */
+            ASSERT(prev_inf->time_slept >= mcus * prev_inf->mcu_advance);
+            prev_inf->time_slept -= mcus * prev_inf->mcu_advance;
+        }
+        else
+        {
+            /* Charge the run, less the bonus, and consume the bonus. */
+            prev_inf->avt += mcus * prev_inf->mcu_advance - prev_inf->vtb;
+
+            ASSERT(prev_inf->time_slept >= prev_inf->vtb);
+            prev_inf->time_slept -= prev_inf->vtb;
+        }
+
+        __calc_evt(prev_inf);
+
+        __del_from_runqueue(prev);
+
+        if ( domain_runnable(prev) )
+            __add_to_runqueue_tail(prev);
+    }
+
+    /* We should at least have the idle task */
+    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
+
+    /*
+     * scan through the run queue and pick the task with the lowest evt
+     * *and* the task the second lowest evt.
+     * this code is O(n) but we expect n to be small.
+     */
+    next = schedule_data[cpu].idle;
+    next_prime = NULL;
+
+    next_evt = ~0U;
+    next_prime_evt = ~0U;
+    min_avt = ~0U;
+
+    list_for_each ( tmp, &schedule_data[cpu].runqueue )
+    {
+        p = list_entry(tmp, struct domain, run_list);
+        p_inf = FBVT_INFO(p);
+
+        if ( p_inf->evt < next_evt )
+        {
+            next_prime = next;
+            next_prime_evt = next_evt;
+            next = p;
+            next_evt = p_inf->evt;
+        }
+        else if ( next_prime_evt == ~0U )
+        {
+            next_prime_evt = p_inf->evt;
+            next_prime = p;
+        }
+        else if ( p_inf->evt < next_prime_evt )
+        {
+            next_prime_evt = p_inf->evt;
+            next_prime = p;
+        }
+
+        /* Determine system virtual time. */
+        if ( p_inf->avt < min_avt )
+            min_avt = p_inf->avt;
+    }
+
+    /* Update system virtual time. */
+    if ( min_avt != ~0U )
+        CPU_SVT(cpu) = min_avt;
+
+    /* check for virtual time overrun on this cpu */
+    if ( CPU_SVT(cpu) >= 0xf0000000 )
+    {
+        u_long t_flags;
+        write_lock_irqsave(&tasklist_lock, t_flags);
+        for_each_domain ( p )
+        {
+            if ( p->processor == cpu )
+            {
+                p_inf = FBVT_INFO(p);
+                p_inf->evt -= 0xe0000000;
+                p_inf->avt -= 0xe0000000;
+            }
+        }
+        write_unlock_irqrestore(&tasklist_lock, t_flags);
+        CPU_SVT(cpu) -= 0xe0000000;
+    }
+
+    next_inf = FBVT_INFO(next);
+
+    /* check for time_slept overrun for the domain we schedule to run*/
+    if ( next_inf->time_slept >= 0xf0000000 )
+    {
+        printk("Domain %d is assigned more CPU then it is able to use.\n"
+               "FBVT slept_time=%d, halving. Mcu_advance=%ld\n",next->domain,
+               next_inf->time_slept, next_inf->mcu_advance);
+
+        next_inf->time_slept /= 2;
+    }
+
+    /*
+     * In here we decide on Virtual Time Bonus. The idea is, for the
+     * domains that have large time_slept values to be allowed to run
+     * for longer. Thus regaining the share of CPU originally allocated.
+     * This is accompanied by the warp mechanism (which moves IO-bound
+     * domains earlier in virtual time). Together this should give quite
+     * good control both for CPU and IO-bound domains.
+     *
+     * The bonus is one fifth of time_slept, capped at 1000. Integer
+     * division is used because floating point must not be used in the
+     * hypervisor; the result equals the former (int)(0.2 * time_slept).
+     */
+    next_inf->vtb = next_inf->time_slept / 5;
+    if ( next_inf->vtb > 1000 )
+        next_inf->vtb = 1000;
+
+    /* work out time for next run through scheduler */
+    if ( is_idle_task(next) )
+    {
+        r_time = ctx_allow;
+        goto sched_done;
+    }
+
+    if ( (next_prime == NULL) || is_idle_task(next_prime) )
+    {
+        /* We have only one runnable task besides the idle task. */
+        r_time = 10 * ctx_allow;     /* RN: random constant */
+        goto sched_done;
+    }
+
+    /*
+     * If we are here then we have two runnable tasks.
+     * Work out how long 'next' can run till its evt is greater than
+     * 'next_prime's evt. Take context switch allowance into account.
+     *
+     * next_prime's info is looked up only now that next_prime is known
+     * to be non-NULL.
+     */
+    next_prime_inf = FBVT_INFO(next_prime);
+    ASSERT(next_prime_inf->evt >= next_inf->evt);
+
+    r_time = ((next_prime_inf->evt + next_inf->vtb - next_inf->evt)/next_inf->mcu_advance)
+        + ctx_allow;
+
+    ASSERT(r_time >= ctx_allow);
+
+ sched_done:
+    next->min_slice = ctx_allow;
+    ret.task = next;
+    ret.time = r_time;
+    return ret;
+}
+
+
+/* Print one run-queue element: mcu_advance, evt, avt, time_slept and vtb. */
+static void fbvt_dump_runq_el(struct domain *p)
+{
+ struct fbvt_dom_info *inf = FBVT_INFO(p);
+
+ printk("mcua=0x%04lX ev=0x%08X av=0x%08X sl=0x%08X vtb=0x%08X ",
+ inf->mcu_advance, inf->evt, inf->avt, inf->time_slept, inf->vtb);
+}
+
+/* Print the global FBVT settings: the MCU and the context switch allowance. */
+static void fbvt_dump_settings(void)
+{
+ printk("FBVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
+}
+
+/* Print the system virtual time of CPU 'i'. */
+static void fbvt_dump_cpu_state(int i)
+{
+ printk("svt=0x%08lX ", CPU_SVT(i));
+}
+
+
+/*
+ * Initialise the data structures: one fbvt_cpu_info per CPU (with svt
+ * starting at 0) and the SLAB cache used for per-domain info.
+ *
+ * Returns 0 on success, -1 if any allocation fails.
+ * NOTE(review): per-CPU allocations already made are not released on the
+ * failure paths -- confirm that a failure here is treated as fatal.
+ */
+int fbvt_init_scheduler(void)
+{
+    int i;
+
+    for ( i = 0; i < NR_CPUS; i++ )
+    {
+        schedule_data[i].sched_priv = kmalloc(sizeof(struct fbvt_cpu_info));
+        if ( schedule_data[i].sched_priv == NULL )
+        {
+            printk("Failed to allocate FBVT scheduler per-CPU memory!\n");
+            return -1;
+        }
+
+        CPU_SVT(i) = 0; /* XXX do I really need to do this? */
+    }
+
+    dom_info_cache = kmem_cache_create("FBVT dom info",
+                                       sizeof(struct fbvt_dom_info),
+                                       0, 0, NULL, NULL);
+
+    if ( dom_info_cache == NULL )
+    {
+        /* Newline added so the message does not run into later output. */
+        printk("FBVT: Failed to allocate domain info SLAB cache\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Pause hook: take the domain off the run queue if it is currently on it. */
+static void fbvt_pause(struct domain *p)
+{
+    if ( !__task_on_runqueue(p) )
+        return;
+
+    __del_from_runqueue(p);
+}
+
+/*
+ * Unpause hook: reset the domain's virtual time so that it resumes at the
+ * current system virtual time of its CPU. The idle domain keeps the
+ * maximum avt/evt.
+ */
+static void fbvt_unpause(struct domain *p)
+{
+ struct fbvt_dom_info *inf = FBVT_INFO(p);
+
+ if ( p->domain == IDLE_DOMAIN_ID )
+ {
+ inf->avt = inf->evt = ~0U;
+ }
+ else
+ {
+ /* Set avt to system virtual time. */
+ inf->avt = CPU_SVT(p->processor);
+ /* Set some default values here. */
+ inf->vtb = 0;
+ /* Derive evt from the new avt (honours the warp state). */
+ __calc_evt(inf);
+ }
+}
+
+/*
+ * Scheduler descriptor for FBVT: binds the functions of this file to the
+ * generic scheduler interface under the option name "fbvt".
+ */
+struct scheduler sched_fbvt_def = {
+ .name = "Fair Borrowed Virtual Time",
+ .opt_name = "fbvt",
+ .sched_id = SCHED_FBVT,
+
+ .init_scheduler = fbvt_init_scheduler,
+ .alloc_task = fbvt_alloc_task,
+ .add_task = fbvt_add_task,
+ .free_task = fbvt_free_task,
+ .wake_up = fbvt_wake_up,
+ .do_block = fbvt_do_block,
+ .do_schedule = fbvt_do_schedule,
+ .control = fbvt_ctl,
+ .adjdom = fbvt_adjdom,
+ .dump_settings = fbvt_dump_settings,
+ .dump_cpu_state = fbvt_dump_cpu_state,
+ .dump_runq_el = fbvt_dump_runq_el,
+ .pause = fbvt_pause,
+ .unpause = fbvt_unpause,
+};
+
* TODO: It would be nice if the schedulers array could get populated
* automagically without having to hack the code in here.
*/
-extern struct scheduler sched_bvt_def, sched_rrobin_def, sched_atropos_def;
+extern struct scheduler sched_bvt_def, sched_fbvt_def, sched_rrobin_def, sched_atropos_def;
static struct scheduler *schedulers[] = { &sched_bvt_def,
+ &sched_fbvt_def,
&sched_rrobin_def,
&sched_atropos_def,
NULL};
spin_unlock_irqrestore(&schedule_lock[cpu], flags);
}
+/*
+ * Pause a domain: put it to sleep with domain_sleep(), then run the
+ * scheduler's 'pause' operation on it via SCHED_OP().
+ */
+void pause_domain(struct domain *domain)
+{
+ domain_sleep(domain);
+ SCHED_OP(pause, domain);
+}
+
+
+/*
+ * Unpause a domain: run the scheduler's 'unpause' operation on it via
+ * SCHED_OP(), then wake it with domain_wake().
+ */
+void unpause_domain(struct domain *domain)
+{
+ SCHED_OP(unpause, domain);
+ domain_wake(domain);
+}
+
/* Block the currently-executing domain until a pertinent event occurs. */
long do_block(void)
{
rem_ac_timer(&schedule_data[cpu].s_timer);
ASSERT(!in_irq());
+if(!__task_on_runqueue(prev)) printk("Domain %d not on runqueue\n",prev->domain);
ASSERT(__task_on_runqueue(prev));
if ( test_bit(DF_BLOCKED, &prev->flags) )
/* Scheduler types */
#define SCHED_BVT 0
-#define SCHED_ATROPOS 1
-#define SCHED_RROBIN 2
+#define SCHED_FBVT 1
+#define SCHED_ATROPOS 2
+#define SCHED_RROBIN 3
/* these describe the intended direction used for a scheduler control or domain
* command */
u32 ctx_allow; /* 8: context switch allowance */
} PACKED bvt;
+ struct fbvt_ctl
+ {
+ /* IN variables. */
+ u32 ctx_allow; /* 8: context switch allowance */
+ } PACKED fbvt;
+
struct rrobin_ctl
{
/* IN variables */
u32 warpu; /* 28: unwarp time requirement */
} PACKED bvt;
+ struct fbvt_adjdom
+ {
+ u32 mcu_adv; /* 16: mcu advance: inverse of weight */
+ u32 warp; /* 20: time warp */
+ u32 warpl; /* 24: warp limit */
+ u32 warpu; /* 28: unwarp time requirement */
+ } PACKED fbvt;
+
struct atropos_adjdom
{
u64 nat_period; /* 16 */
void (*dump_runq_el) (struct domain *);
int (*prn_state) (int);
void (*pause) (struct domain *);
+ void (*unpause) (struct domain *);
};
/* per CPU scheduler information */
void init_idle_task(void);
void domain_wake(struct domain *d);
void domain_sleep(struct domain *d);
+void pause_domain(struct domain *d);
+void unpause_domain(struct domain *d);
void __enter_scheduler(void);
{
ASSERT(d != current);
atomic_inc(&d->pausecnt);
- domain_sleep(d);
+ pause_domain(d);
}
static inline void domain_unpause(struct domain *d)
{
ASSERT(d != current);
if ( atomic_dec_and_test(&d->pausecnt) )
- domain_wake(d);
+ unpause_domain(d);
}
static inline void domain_unblock(struct domain *d)
{
ASSERT(d != current);
if ( !test_and_set_bit(DF_CTRLPAUSE, &d->flags) )
- domain_sleep(d);
+ pause_domain(d);
}
static inline void domain_unpause_by_systemcontroller(struct domain *d)
{
if ( test_and_clear_bit(DF_CTRLPAUSE, &d->flags) )
- domain_wake(d);
+ unpause_domain(d);
}